# Loading libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import plotly.express as px
# Load the diamonds dataset from its CSV file into a DataFrame.
# NOTE(review): the path is absolute and tied to one workspace; adjust it when running elsewhere.
diamonds_df = pd.read_csv("/workspaces/portfolio/diamonds.csv")
# Display the DataFrame
diamonds_df
# From the displayed DataFrame it is clear that we have:
# - 10 columns, which represent the specifications of the diamonds
# - 53940 rows, where each row represents one diamond
| | carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53935 | 0.72 | Ideal | D | SI1 | 60.8 | 57.0 | 2757 | 5.75 | 5.76 | 3.50 |
| 53936 | 0.72 | Good | D | SI1 | 63.1 | 55.0 | 2757 | 5.69 | 5.75 | 3.61 |
| 53937 | 0.70 | Very Good | D | SI1 | 62.8 | 60.0 | 2757 | 5.66 | 5.68 | 3.56 |
| 53938 | 0.86 | Premium | H | SI2 | 61.0 | 58.0 | 2757 | 6.15 | 6.12 | 3.74 |
| 53939 | 0.75 | Ideal | D | SI2 | 62.2 | 55.0 | 2757 | 5.83 | 5.87 | 3.64 |
53940 rows × 10 columns
# Information summary for the DataFrame (dtype and non-null count per column)
diamonds_df.info()
# From the printed info it is clear that:
# - there are no missing values
# - the columns carat, depth, table, x, y, and z are stored as float64
# - the columns cut, color, and clarity are stored as object
# - the column price is the only column stored as an integer (int64)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 53940 entries, 0 to 53939 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 carat 53940 non-null float64 1 cut 53940 non-null object 2 color 53940 non-null object 3 clarity 53940 non-null object 4 depth 53940 non-null float64 5 table 53940 non-null float64 6 price 53940 non-null int64 7 x 53940 non-null float64 8 y 53940 non-null float64 9 z 53940 non-null float64 dtypes: float64(6), int64(1), object(3) memory usage: 4.1+ MB
# Statistical summary of the numeric columns
diamonds_df.describe()
# The description helps to see the count, mean, std, and range
# (min, quartiles, max) of each numeric column.
# NOTE(review): the minimum of x, y and z is 0 in this summary, which suggests
# some rows carry invalid dimensions — worth checking before using them.
| | carat | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|
| count | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 |
| mean | 0.797940 | 61.749405 | 57.457184 | 3932.799722 | 5.731157 | 5.734526 | 3.538734 |
| std | 0.474011 | 1.432621 | 2.234491 | 3989.439738 | 1.121761 | 1.142135 | 0.705699 |
| min | 0.200000 | 43.000000 | 43.000000 | 326.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.400000 | 61.000000 | 56.000000 | 950.000000 | 4.710000 | 4.720000 | 2.910000 |
| 50% | 0.700000 | 61.800000 | 57.000000 | 2401.000000 | 5.700000 | 5.710000 | 3.530000 |
| 75% | 1.040000 | 62.500000 | 59.000000 | 5324.250000 | 6.540000 | 6.540000 | 4.040000 |
| max | 5.010000 | 79.000000 | 95.000000 | 18823.000000 | 10.740000 | 58.900000 | 31.800000 |
# Number of diamonds per cut grade, most frequent first
diamonds_df["cut"].value_counts()
Ideal 21551 Premium 13791 Very Good 12082 Good 4906 Fair 1610 Name: cut, dtype: int64
# Number of diamonds per color grade, most frequent first
diamonds_df["color"].value_counts()
G 11292 E 9797 F 9542 H 8304 D 6775 I 5422 J 2808 Name: color, dtype: int64
# Number of diamonds per clarity grade, most frequent first
diamonds_df["clarity"].value_counts()
SI1 13065 VS2 12258 SI2 9194 VS1 8171 VVS2 5066 VVS1 3655 IF 1790 I1 741 Name: clarity, dtype: int64
# A helper that computes the fraction of rows whose clarity equals a given grade.
def fraction_clar(elemnt, data=None):
    """Print the fraction of diamonds whose ``clarity`` equals ``elemnt``.

    Args:
        elemnt: Clarity label to look for (for example ``'IF'``).
        data: DataFrame with a ``clarity`` column. Defaults to the
            module-level ``diamonds_df`` so existing one-argument calls
            keep working.
    """
    if data is None:
        data = diamonds_df
    # The mean of a boolean mask is the share of True values. This stays
    # vectorised instead of iterating the Series with the built-in sum().
    print((data["clarity"] == elemnt).mean())
# Print the share of diamonds with clarity 'IF', then with clarity 'SI2'.
fraction_clar('IF')
fraction_clar('SI2')
0.03318502039302929 0.17044864664441972
# Range of price as [min, max].
# print() returns None, so the list is stored first and printed afterwards;
# assigning the result of print() would leave range_of_price holding None.
range_of_price = [diamonds_df.price.min(), diamonds_df.price.max()]
print(range_of_price)
[326, 18823]
# Select only the rows whose cut is 'Ideal'
diamonds_df.loc[diamonds_df["cut"] == 'Ideal']
| | carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 11 | 0.23 | Ideal | J | VS1 | 62.8 | 56.0 | 340 | 3.93 | 3.90 | 2.46 |
| 13 | 0.31 | Ideal | J | SI2 | 62.2 | 54.0 | 344 | 4.35 | 4.37 | 2.71 |
| 16 | 0.30 | Ideal | I | SI2 | 62.0 | 54.0 | 348 | 4.31 | 4.34 | 2.68 |
| 39 | 0.33 | Ideal | I | SI2 | 61.8 | 55.0 | 403 | 4.49 | 4.51 | 2.78 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53925 | 0.79 | Ideal | I | SI1 | 61.6 | 56.0 | 2756 | 5.95 | 5.97 | 3.67 |
| 53926 | 0.71 | Ideal | E | SI1 | 61.9 | 56.0 | 2756 | 5.71 | 5.73 | 3.54 |
| 53929 | 0.71 | Ideal | G | VS1 | 61.4 | 56.0 | 2756 | 5.76 | 5.73 | 3.53 |
| 53935 | 0.72 | Ideal | D | SI1 | 60.8 | 57.0 | 2757 | 5.75 | 5.76 | 3.50 |
| 53939 | 0.75 | Ideal | D | SI2 | 62.2 | 55.0 | 2757 | 5.83 | 5.87 | 3.64 |
21551 rows × 10 columns
# Mean price for each cut quality
diamonds_df.groupby('cut')['price'].mean()
# Ideal has the lowest average price in comparison to the other cuts.
# This indicates that the price does not depend on the cut alone;
# other factors might affect the price, such as
# color, clarity, carat, etc.
cut Fair 4358.757764 Good 3928.864452 Ideal 3457.541970 Premium 4584.257704 Very Good 3981.759891 Name: price, dtype: float64
# Mean price for each clarity grade
diamonds_df.groupby('clarity')['price'].mean()
clarity I1 3924.168691 IF 2864.839106 SI1 3996.001148 SI2 5063.028606 VS1 3839.455391 VS2 3924.989395 VVS1 2523.114637 VVS2 3283.737071 Name: price, dtype: float64
# Print the rows that match one specific cut and one specific clarity.
def my_pfunction(ycut, yclarity, data=None):
    """Print every row whose ``cut`` is ``ycut`` and ``clarity`` is ``yclarity``.

    Args:
        ycut: Cut label to match exactly.
        yclarity: Clarity label to match exactly.
        data: DataFrame with ``cut`` and ``clarity`` columns. Defaults to
            the module-level ``diamonds_df`` so existing two-argument
            calls keep working.
    """
    if data is None:
        data = diamonds_df
    # Both conditions must hold for a row to be kept.
    matches = (data["cut"] == ycut) & (data["clarity"] == yclarity)
    print(data[matches])
# Show the diamonds that have both cut 'Ideal' and clarity 'IF'.
my_pfunction('Ideal', 'IF')
carat cut color clarity depth table price x y z 229 0.52 Ideal F IF 62.2 55.0 2783 5.14 5.18 3.21 250 0.55 Ideal G IF 60.9 57.0 2789 5.28 5.30 3.22 256 0.64 Ideal G IF 61.3 56.0 2790 5.54 5.58 3.41 313 0.61 Ideal G IF 62.3 56.0 2800 5.43 5.45 3.39 326 0.53 Ideal F IF 61.9 54.0 2802 5.22 5.25 3.24 ... ... ... ... ... ... ... ... ... ... ... 53606 0.71 Ideal I IF 61.7 59.0 2701 5.70 5.74 3.53 53790 0.54 Ideal F IF 62.2 54.0 2729 5.24 5.27 3.27 53792 0.51 Ideal F IF 62.5 54.0 2730 5.12 5.16 3.21 53811 0.52 Ideal F IF 61.7 57.0 2733 5.13 5.17 3.18 53887 0.52 Ideal F IF 61.5 57.0 2749 5.15 5.19 3.18 [1212 rows x 10 columns]
# Print the mean price for one specific cut and one specific clarity.
def my_function(xcut, xclarity, data=None):
    """Print the mean ``price`` of rows matching ``xcut`` and ``xclarity``.

    Prints ``nan`` when no row matches both labels.

    Args:
        xcut: Cut label to match exactly.
        xclarity: Clarity label to match exactly.
        data: DataFrame with ``cut``, ``clarity`` and ``price`` columns.
            Defaults to the module-level ``diamonds_df`` so existing
            two-argument calls keep working.
    """
    if data is None:
        data = diamonds_df
    matches = (data["cut"] == xcut) & (data["clarity"] == xclarity)
    print(data.loc[matches, "price"].mean())
# Mean price for a few (cut, clarity) combinations.
my_function('Ideal', 'SI2')
my_function('Ideal', 'IF')
my_function('Good', 'SI2')
# NOTE(review): 'G' is a color grade, not a cut, so no row matches and the
# result is nan — 'Good' was probably intended here.
my_function('G', 'IF')
4755.952655889146 2272.9133663366338 4580.260869565217 nan
# Number of distinct values in each column, printed as "<column> <count>"
for col in diamonds_df.columns:
    print(col, diamonds_df[col].nunique())
carat 273 cut 5 color 7 clarity 8 depth 184 table 127 price 11602 x 554 y 552 z 375
# Standard deviation of the price column
diamonds_df["price"].std()
3989.4397381463023
# Mean of the price column
diamonds_df["price"].mean()
3932.799721913237
# Lower and upper limits of the interval mean ± z standard deviations
# (z = 1.96 is the usual choice for a 95% interval).
def limits(nums, z=1.96):
    """Return ``[lower, upper]`` where the limits are ``mean ∓ z * std``.

    Args:
        nums: Object exposing ``mean()`` and ``std()`` (e.g. a pandas Series).
        z: Number of standard deviations on each side of the mean.
            The default 1.96 covers roughly 95% of values only if the
            data is approximately normally distributed.

    Returns:
        A two-element list ``[lowerlimit, upperlimit]``.
    """
    # Compute each statistic once instead of once per limit.
    center = nums.mean()
    spread = z * nums.std()
    return [center - spread, center + spread]
# Lower and upper limits for the price column
limits(diamonds_df["price"])
[-3886.5021648535153, 11752.10160867999]
# Any value outside the interval is considered an unusual value.
# The limits come from limits() rather than hand-copied numbers, so the
# count stays correct if the data changes; .sum() counts the True entries
# of the mask without a Python-level loop.
price_lower, price_upper = limits(diamonds_df.price)
((diamonds_df.price > price_upper) | (diamonds_df.price < price_lower)).sum()
3640
# Values strictly inside the interval.
# The limits come from limits() rather than hand-copied numbers, so the
# count stays correct if the data changes; .sum() counts the True entries
# of the mask without a Python-level loop.
price_lower, price_upper = limits(diamonds_df.price)
((diamonds_df.price < price_upper) & (diamonds_df.price > price_lower)).sum()
50300
# Percentage of prices outside the interval, relative to ALL rows.
# Dividing the outside count by the inside count (3640 / 50300) overstates
# the share; the denominator must be the total number of rows (53940).
price_lower, price_upper = limits(diamonds_df.price)
outside_mask = (diamonds_df.price > price_upper) | (diamonds_df.price < price_lower)
# The mean of a boolean mask is the fraction of True values.
print(outside_mask.mean() * 100)
# about 6.7% of the values are outside the interval
7.236580516898608
# Let's say we want to add a size column equal to the product x * y * z.
# Adding the size column.
# NOTE: attribute access (diamonds_df.size) returns the DataFrame's element
# count, so this column must be read as diamonds_df['size'].
diamonds_df['size']=diamonds_df['x'] * diamonds_df['y'] * diamonds_df['z']
# Display the DataFrame after adding the size column
diamonds_df
| | carat | cut | color | clarity | depth | table | price | x | y | z | size |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 | 38.202030 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 | 34.505856 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 | 38.076885 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 | 46.724580 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 | 51.917250 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53935 | 0.72 | Ideal | D | SI1 | 60.8 | 57.0 | 2757 | 5.75 | 5.76 | 3.50 | 115.920000 |
| 53936 | 0.72 | Good | D | SI1 | 63.1 | 55.0 | 2757 | 5.69 | 5.75 | 3.61 | 118.110175 |
| 53937 | 0.70 | Very Good | D | SI1 | 62.8 | 60.0 | 2757 | 5.66 | 5.68 | 3.56 | 114.449728 |
| 53938 | 0.86 | Premium | H | SI2 | 61.0 | 58.0 | 2757 | 6.15 | 6.12 | 3.74 | 140.766120 |
| 53939 | 0.75 | Ideal | D | SI2 | 62.2 | 55.0 | 2757 | 5.83 | 5.87 | 3.64 | 124.568444 |
53940 rows × 11 columns
# Since the size column has been added, the x, y and z columns are dropped
# (in place, so diamonds_df itself is modified).
diamonds_df.drop(columns=['x', 'y', 'z'], inplace=True)
diamonds_df
| | carat | cut | color | clarity | depth | table | price | size |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 38.202030 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 34.505856 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 38.076885 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 46.724580 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 51.917250 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 53935 | 0.72 | Ideal | D | SI1 | 60.8 | 57.0 | 2757 | 115.920000 |
| 53936 | 0.72 | Good | D | SI1 | 63.1 | 55.0 | 2757 | 118.110175 |
| 53937 | 0.70 | Very Good | D | SI1 | 62.8 | 60.0 | 2757 | 114.449728 |
| 53938 | 0.86 | Premium | H | SI2 | 61.0 | 58.0 | 2757 | 140.766120 |
| 53939 | 0.75 | Ideal | D | SI2 | 62.2 | 55.0 | 2757 | 124.568444 |
53940 rows × 8 columns
# Bar plot of the mean price per clarity, ordered from the highest price to the lowest
mean_clarity = diamonds_df.groupby('clarity')['price'].mean()
fig1 = px.bar(mean_clarity, title="Price for each clarity (Descending)")
fig1.update_layout(
    xaxis=dict(categoryorder='total descending'),  # tallest bar first
    xaxis_title="Clarity",
    yaxis_title="Price",
    barmode='stack',
)
fig1.show()
# The graph shows that the mean price differs noticeably between clarity grades
# Box plot of the price distribution for each clarity.
# The title says "(Descending)", so the clarity categories are ordered by
# descending mean price to match it (the same ordering criterion as the bar plot).
# The unused y0/y1 names and the unrelated px.data.tips() sample dataset
# load were removed; nothing in this cell read them.
clarity_order = (
    diamonds_df.groupby('clarity')['price']
    .mean()
    .sort_values(ascending=False)
    .index.tolist()
)
fig = px.box(
    diamonds_df,
    x='clarity',
    y="price",
    title="Price for each clarity (Descending)",
    category_orders={'clarity': clarity_order},
)
fig.show()
# Bar plot of the mean price per color, ordered from the highest price to the lowest
mean_color = diamonds_df.groupby('color')['price'].mean()
fig2 = px.bar(mean_color, title="Price for each color (Descending)")
fig2.update_layout(
    xaxis=dict(categoryorder='total descending'),  # tallest bar first
    xaxis_title="Color",
    yaxis_title="Price",
    barmode='stack',
)
fig2.show()
# The graph is meant to show how strongly the mean price varies with the color
# Bar plot of the mean price per cut, ordered from the highest price to the lowest
mean_cut = diamonds_df.groupby('cut')['price'].mean()
fig3 = px.bar(mean_cut, title="Price for each cut (Descending)")
fig3.update_layout(
    xaxis=dict(categoryorder='total descending'),  # tallest bar first
    xaxis_title="Cut",
    yaxis_title="Price",
    barmode='stack',
)
fig3.show()